library(qtlcharts)
library(CountClust)
library(parallel)
library(cellcycleR)
library(data.table)
library(binhf)
library(vioplot)
library(limma)
library(readxl)
library(Humanzee)
library(mygene)
library(knitr)
After fitting sinusoidal model to Gilad2015 iPSC NA19239 (link1, we fitted PCA to the same data and compare fitting results.
We fit the model under the scenarios of 1) all annotated cell-cycle genes, 2) high SNR from cell-cycle genes, 3) best set from the high-SNR genes. In all scenarios, PCA performs poorly…
molecules_final <- read.table("../data/gilad-2015/molecules-final.txt",
header = TRUE,
stringsAsFactors = FALSE)
anno_filter <- read.table("../data/gilad-2015/annotation-filter.txt",
header = TRUE,
stringsAsFactors = FALSE)
dim(molecules_final)
## [1] 12192 560
table(anno_filter$individual)
##
## NA19098 NA19101 NA19239
## 141 199 220
Extract only data of one individual
molecules_final_subset <- molecules_final[ , anno_filter$individual == "NA19239"]
dim(molecules_final_subset)
## [1] 12192 220
molecules_final_subset <- molecules_final_subset
Import cell-cycle genes
cellcycle_genes <- read.table("../data/gilad-2015/cellcyclegenes.txt",
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)
colnames(cellcycle_genes) <- c("G1.S","S","G2","G2.M","M.G1")
Extract only cell-cycle genes
which_cell_cycle <- which (rownames(molecules_final_subset) %in% unlist(cellcycle_genes))
cycle_data <- t(molecules_final_subset[which_cell_cycle, ])
dim(cycle_data)
## [1] 220 536
Standardize expression levels into z-scores for for gene. This step is for easy visualization of expression levels in plots.
cycle_data_normed <- apply(cycle_data, 2,
function(x) return (x-mean(x))/sd(x))
if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")) {
load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")
} else {
results <- sin_cell_ordering_class(cycle_data_normed,
celltime_levels = 300,
num_iter = 300)
save(results,
file ="../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")
}
str(results)
## List of 6
## $ cell_times : num [1:220] 5.758 2.753 4.287 0.357 3.257 ...
## $ amp : num [1:536] 0.0689 0.256 0.1392 0.0721 0.3354 ...
## $ phi : num [1:536] 1.312 0.746 1.426 1.245 1.873 ...
## $ sigma : num [1:536] 0.434 0.828 0.647 0.546 1.456 ...
## $ loglik : num -148111
## $ signal_intensity: num [1:220, 1:300] -698 -781 -687 -667 -859 ...
Post-processing of cell-phase order.
cell_order_full <- cell_ordering_full(results$signal_intensity, dim(molecules_final)[2])
str(cell_order_full)
## num [1:220(1d)] 5.625 2.732 4.413 0.226 2.972 ...
We look at the plots of the amplitudes, phases and the non signal variation of the genes.
amp_genes <- results$amp
sd_genes <- results$sigma
phi_genes <- results$phi
par(mfrow=c(2,2))
plot(density(phi_genes), col="red",
main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red",
main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red",
main="Density plot of the non-signal sd")
ESS <- amp_genes^2
RSS <- sd_genes^2
SNR <- ESS/RSS
plot(SNR, col="red", pch=20, lwd=1)
After re-ordering.
new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed[new_cell_order, ]) )
## Set screen size to height=700 x width=1000
pca_cycle <- prcomp(cycle_data_normed[new_cell_order, ], center=TRUE, scale. = TRUE)
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)
We extract the high SNR genes as they seem to have sinusoidal patterns and repeat the procedure again.
snr_high_indices <- which(SNR > .1)
cycle_data_normed_high_snr <- cycle_data_normed[ ,snr_high_indices]
dim(cycle_data_normed_high_snr)
## [1] 220 60
Modeing fitting…
if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")) {
load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")
} else {
results_high_snr <- sin_cell_ordering_class(cycle_data_normed_high_snr,
celltime_levels = 300, num_iter=200)
save(results_high_snr,
file = "../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")
}
Post-processing
cell_order_full <- cell_ordering_full(results_high_snr$signal_intensity,
dim(cycle_data_normed_high_snr)[2])
We plot the same features as above and check for the robustness. We needed to shift the cell order so as to compare with previous plot on all genes as the method is non-identifiable upto a rotation.
amp_genes <- results_high_snr$amp;
sd_genes <- results_high_snr$sigma;
phi_genes <- results_high_snr$phi;
par(mfrow = c(2,2))
plot(density(phi_genes), col="red", main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")
ESS <- amp_genes^2; RSS <- sd_genes^2
SNR <- ESS/RSS;
plot(SNR, col="red", pch=20, lwd=1)
new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed_high_snr[new_cell_order, ]) )
pca_cycle <- prcomp(cycle_data_normed_high_snr[new_cell_order, ],
center=TRUE, scale. = TRUE);
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)
snr_high_indices <- which(SNR > .5)
cycle_data_normed_high_snr_high <- cycle_data_normed_high_snr[ ,snr_high_indices]
dim(cycle_data_normed_high_snr_high)
## [1] 220 18
Modeing fitting…
if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr-high.rda")) {
load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr-high.rda")
} else {
results_high_snr_high <- sin_cell_ordering_class(cycle_data_normed_high_snr_high,
celltime_levels = 300, num_iter=200)
save(results_high_snr_high,
file = "../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr0high.rda")
}
## Final loglikelihood, iter 200 : -546.854341498
Post-processing
cell_order_full <- cell_ordering_full(results_high_snr_high$signal_intensity,
dim(cycle_data_normed_high_snr)[2])
We plot the same features as above and check for the robustness. We needed to shift the cell order so as to compare with previous plot on all genes as the method is non-identifiable upto a rotation.
amp_genes <- results_high_snr_high$amp;
sd_genes <- results_high_snr_high$sigma;
phi_genes <- results_high_snr_high$phi;
par(mfrow = c(2,2))
plot(density(phi_genes), col="red", main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")
ESS <- amp_genes^2; RSS <- sd_genes^2
SNR <- ESS/RSS;
plot(SNR, col="red", pch=20, lwd=1)
new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed_high_snr_high[new_cell_order, ]) )
pca_cycle <- prcomp(cycle_data_normed_high_snr_high[new_cell_order, ],
center=TRUE, scale. = TRUE);
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.11 mygene_1.2.3 GenomicFeatures_1.20.6
## [4] AnnotationDbi_1.30.1 Biobase_2.28.0 GenomicRanges_1.20.8
## [7] GenomeInfoDb_1.4.3 IRanges_2.2.9 S4Vectors_0.6.6
## [10] BiocGenerics_0.14.0 Humanzee_0.1.0 readxl_0.1.0
## [13] limma_3.24.15 vioplot_0.2 sm_2.2-5.4
## [16] binhf_1.0-1 adlift_1.3-2 EbayesThresh_1.3.2
## [19] wavethresh_4.6.6 MASS_7.3-45 data.table_1.9.6
## [22] cellcycleR_0.1.1 CountClust_0.1.0 qtlcharts_0.5-25
##
## loaded via a namespace (and not attached):
## [1] httr_1.0.0 jsonlite_0.9.19
## [3] splines_3.2.1 gsubfn_0.6-6
## [5] Formula_1.2-1 latticeExtra_0.6-26
## [7] Rsamtools_1.20.5 yaml_2.1.13
## [9] RSQLite_1.0.0 lattice_0.20-33
## [11] chron_2.3-47 digest_0.6.8
## [13] RColorBrewer_1.1-2 XVector_0.8.0
## [15] colorspace_1.2-6 htmltools_0.3
## [17] plyr_1.8.3 XML_3.98-1.3
## [19] biomaRt_2.24.1 zlibbioc_1.14.0
## [21] scales_0.3.0 BiocParallel_1.2.22
## [23] sqldf_0.4-10 ggplot2_2.0.0
## [25] nnet_7.3-11 proto_0.3-10
## [27] survival_2.38-3 magrittr_1.5
## [29] evaluate_0.8 foreign_0.8-66
## [31] tools_3.2.1 formatR_1.2.1
## [33] stringr_1.0.0 munsell_0.4.2
## [35] cluster_2.0.3 lambda.r_1.1.7
## [37] Biostrings_2.36.4 futile.logger_1.4.1
## [39] grid_3.2.1 RCurl_1.95-4.7
## [41] htmlwidgets_0.5 bitops_1.0-6
## [43] rmarkdown_0.9.2 gtable_0.1.2
## [45] DBI_0.3.1 R6_2.1.1
## [47] GenomicAlignments_1.4.2 gridExtra_2.0.0
## [49] rtracklayer_1.28.10 Hmisc_3.17-1
## [51] futile.options_1.0.0 stringi_1.0-1
## [53] Rcpp_0.12.2 rpart_4.1-10
## [55] acepack_1.3-3.3